suppressPackageStartupMessages({
  import(rpkgs)
})

import(run)

Baseline model: predict target = average value for the asset

Model

# Identifier for this model: passed to doRun() as `name` and used later to
# select this model's rows from the metrics table.
modelName = "baseline-avg"

# Asset table; its asset_id column is handed to doRun() and its asset_name
# column is sampled when plotting.
assets = getAllAssets()
## 2021-11-30 02:04:09 INFO::Sourcing ALL_ASSETS
# Run one train/test experiment for the per-asset average baseline.
#
# Delegates to doRun(), supplying three callbacks:
#   makeData     - queries the `trn` table for the requested date range and
#                  assets; stores the feature columns in env$x and the target
#                  column in env$y.
#   trainModel   - stores, for each asset seen in training, the mean of its
#                  target on `model`.
#   predictModel - fills tst$yhat with the stored per-asset mean.
#
# Returns whatever doRun() returns.
runModel = \() {
  doRun(
    name = modelName,
    trnAmt = 60 * 24 * 7 * 1, # 1 week of data, chosen arbitrarily
    tstAmt = 60 * 24 * 7 * 2, # 2 weeks, submission period will provide new data every 2 weeks
    assets = assets[,asset_id],

    makeData = \(env, minDate, maxDate, assets, ...) {
      # The date bounds are bound as query parameters ($1, $2); the asset ids
      # are interpolated into the IN (...) list by glue.
      selectStmt = glue('
        SELECT ts, asset_id, asset_name, target
        FROM trn
        WHERE (ts BETWEEN $1 AND $2)
          AND asset_id IN ({paste(assets, collapse = ", ")})
      ')

      df = getQuery(selectStmt, params = list(minDate, maxDate))
      
      env$x = df[,.(ts, asset_id, asset_name)]
      env$y = df[,.(target)]
    },

    trainModel = \(model, trn, ...) {
      # `model` is modified in place and nothing is returned, so it presumably
      # has reference semantics (e.g. an environment) -- confirm against doRun().
      model$description = 'mean of target'

      # paste() inserts its default " " separator, so keys look like
      # "asset- 3". Left unchanged: training and prediction both build keys
      # through this helper, so they always agree.
      model$getKeyForAsset = \(a) paste("asset-", a)
      for (a in unique(trn$x[,asset_id])) {
        idx = trn$x[,asset_id] == a
        key = model$getKeyForAsset(a)
        prediction = mean(trn$y[idx,target], na.rm = TRUE)
        # mean() is NaN when every target for the asset is missing; fall back
        # to predicting 0 in that case.
        if (is.na(prediction)) prediction = 0
        model[[key]] = prediction
      }
    },

    predictModel = \(model, tst, ...) {
      # Start from an all-NA numeric vector. rep() also handles an empty test
      # set, where indexing with 1:length(x) would have addressed c(1, 0) and
      # grown the vector to length 1.
      yhat = rep(NA_real_, nrow(tst$x))

      for (a in unique(tst$x[,asset_id])) {
        idx = tst$x[,asset_id] == a
        prediction = model[[model$getKeyForAsset(a)]]
        # An asset absent from the training window has no stored mean, so the
        # lookup yields NULL and assigning it would fail with "replacement has
        # length zero". Use 0, the same fallback trainModel applies when an
        # asset has no usable target.
        if (is.null(prediction)) prediction = 0
        yhat[idx] <- prediction
      }

      tst$yhat = yhat
    }
  )
}

Method

Same method as was used for the baseline “target = 0” model.

# Number of times the experiment is repeated.
numSamples = 610

# Seed the RNG before the runs; presumably doRun() draws its train/test
# windows at random, which this makes repeatable -- confirm.
set.seed(205794)

# seq_len() gives an empty sequence when numSamples is 0, whereas 1:0 would
# iterate twice. `results` is overwritten on every iteration, so only the
# final run's value remains afterwards.
for (i in seq_len(numSamples)) {
  results = runModel()
}

Plots

We can examine the results from the last run, as a sanity-check.

# Put the last run's test features, observed target and prediction side by
# side in one table.
df = results$tst$x
df$y = results$tst$y$target
df$yhat = results$tst$yhat

set.seed(68420)

# Pick one timestamp at random and look at the 200 minutes that follow it.
plotStart = sample(df[,ts], 1)
plotEnd = plotStart + as.difftime(200, units = "mins")

# For two randomly chosen assets, draw observed (y) against predicted (yhat)
# values inside the sampled window.
assets[sample(nrow(assets), 2),asset_name] |>
  lapply(\(assetName) {
    windowDf = df[asset_name == assetName & ts > plotStart & ts < plotEnd]
    longDf = melt(
      windowDf,
      id.vars = c("ts", "asset_name"),
      measure.vars = c("y", "yhat")
    )

    ggplot(longDf, aes(ts, value, colour = variable)) +
      geom_line() +
      facet_wrap(~asset_name, ncol = 1)
  }) |>
  print()
## Warning: Removed 7 row(s) containing missing values (geom_path).

The competition metric is correlation between your predictions and the targets.

Visualising this:

## Warning: Removed 62809 rows containing non-finite values (stat_bin2d).

Remember, that’s just for 1 run; we repeated that experiment 610 times!

Evaluation

# Fetch every metrics row recorded under this model's name.
scores = getQuery('SELECT * FROM metrics WHERE name = $1', params = list(modelName))

# Show the per-run correlation and error columns as an interactive table.
scores[,.(run_id, corr, mae, aae, rmse)] |>
  DT::datatable()

Median correlation: 0.0018818.